# load necessary libraries
library(Seurat)
library(dplyr)
library(ggplot2)
# library(CelltypeR)
Read in the flow data. This data should be the gated live cells.
All samples need to be in one folder.
# Folder holding the exported .fcs files (all samples in one folder)
input_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/FlowDataFiles/9MBO"
# Folder where analysis outputs are written (note the trailing slash)
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# 1.a Read in FlowJo Files
# choose to downsample to 9000 where possible
# TODO: check this - the downsampling may not have been applied
flowset <- fsc_to_fs(input_path, downsample = 9000)
# downsample can be a number, 'none' or 'min'
# look at the file names so they can be replaced with shorter sample names
sampleNames(flowset)
[1] "2020-03-06- export_bioinfo_3450c_live cells.fcs"
[2] "2020-03-06- export_bioinfo_AIW_live cells.fcs"
[3] "2020-03-06- export_bioinfo_AJG_live cells.fcs"
[4] "2020-03-17- export_bioinfo_old 3450c_live cells.fcs"
[5] "2020-03-17- export_bioinfo_old AIW_live cells.fcs"
[6] "2020-03-17- export_bioinfo_old AJG_live cells.fcs"
[7] "2020-03-17- export_bioinfo_young 3450c_live cells.fcs"
[8] "2020-03-17- export_bioinfo_young AIW_live cells.fcs"
[9] "2020-03-17- export_bioinfo_young AJG_live cells.fcs"
# Replace the long exported file names with short sample identifiers.
# The new names are matched by position, so they must follow the order
# printed by sampleNames(flowset) above.
sampleNames(flowset) <- c(
  "3450_0306", "AIW002_0306", "AJG001C_0306",
  "3450_0317A", "AIW002_0317A", "AJG001C_0317A",
  "3450_0317B", "AIW002_0317B", "AJG001C_0317B"
)
# confirm the new names
sampleNames(flowset)
[1] "3450_0306" "AIW002_0306" "AJG001C_0306" "3450_0317A"
[5] "AIW002_0317A" "AJG001C_0317A" "3450_0317B" "AIW002_0317B"
[9] "AJG001C_0317B"
# if we want to save the flowset object now we can
# this would be read back in with flowset
#
Harmonize data to remove batch or technical variation
This requires us to look and see where there are two peaks to align. We need to visualize the peaks of the transformed data before aligning.
# we can decide what level of processing to apply with the argument 'processing'
# biexp only applies a biexponential transformation
# align applies the biexp transform and then aligns peaks
# retro (default) transforms, aligns and then reverse transforms
flowset_biexp <- harmonize(flowset, processing = 'biexp')
# we can visualize the peaks to see where there are two peaks for alignment
# we need to enter the column indices of the peaks to align; the alignment for
# one peak and for two peaks is not the same, so they are given separately
#plotdensity_flowset(flowset)
#plotdensity_flowset(flowset_biexp) # to see the peaks
flowset_align <- harmonize(flowset, processing = 'align',
two_peaks = c(7:20),
one_peak = c(1:6,21), threshold = 0.01)
Adjusting the distance between landmarks
.........
Adjusting the distance between landmarks
.........
# 'retro' processing with the same peak column indices as the 'align' run
flowset_retro <- harmonize(
  flowset,
  processing = "retro",
  two_peaks = 7:20,
  one_peak = c(1:6, 21),
  threshold = 0.01
)
Adjusting the distance between landmarks
.........
Adjusting the distance between landmarks
.........
# convert the retro-processed flowset into the data frame used for clustering below
df <- flowset_to_csv(flowset_retro)
Now we have applied all the different processing options to the fcs files. We can visualize the intensity in cell density plots to see the alignment.
# density plots of the untransformed data (disabled)
#plotdensity_flowset(flowset)
# density plots after the biexponential transform only
plotdensity_flowset(flowset_biexp)
Warning in melt(lapply(as.list(flowset@frames), function(x) { :
The melt generic in data.table has been passed a list and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(lapply(as.list(flowset@frames), function(x) { x = as.data.frame(x@exprs)})). In the next version, this warning will become an error.
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
Warning in geom_density_ridges(alpha = 0.4, verbose = FALSE) :
Ignoring unknown parameters: `verbose`
Picking joint bandwidth of 0.0468
Picking joint bandwidth of 0.0299
Picking joint bandwidth of 0.0164
Picking joint bandwidth of 0.082
Picking joint bandwidth of 0.0669
Picking joint bandwidth of 0.0172
Picking joint bandwidth of 0.818
Picking joint bandwidth of 0.149
Picking joint bandwidth of 0.722
Picking joint bandwidth of 0.862
Picking joint bandwidth of 0.255
Picking joint bandwidth of 0.24
Picking joint bandwidth of 0.157
Picking joint bandwidth of 0.381
Picking joint bandwidth of 0.614
Picking joint bandwidth of 0.729
Picking joint bandwidth of 0.564
Picking joint bandwidth of 0.241
Picking joint bandwidth of 0.754
Picking joint bandwidth of 0.661
Picking joint bandwidth of 0.102
# density plots after transformation and peak alignment
plotdensity_flowset(flowset_align)
Warning in melt(lapply(as.list(flowset@frames), function(x) { :
The melt generic in data.table has been passed a list and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(lapply(as.list(flowset@frames), function(x) { x = as.data.frame(x@exprs)})). In the next version, this warning will become an error.
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
No id variables; using all as measure variables
Warning in geom_density_ridges(alpha = 0.4, verbose = FALSE) :
Ignoring unknown parameters: `verbose`
Picking joint bandwidth of 0.0468
Picking joint bandwidth of 0.0299
Picking joint bandwidth of 0.0164
Picking joint bandwidth of 0.082
Picking joint bandwidth of 0.0669
Picking joint bandwidth of 0.0172
Picking joint bandwidth of 0.819
Picking joint bandwidth of 0.194
Picking joint bandwidth of 0.724
Picking joint bandwidth of 0.863
Picking joint bandwidth of 0.301
Picking joint bandwidth of 0.273
Picking joint bandwidth of 0.156
Picking joint bandwidth of 0.39
Picking joint bandwidth of 0.616
Picking joint bandwidth of 0.742
Picking joint bandwidth of 0.561
Picking joint bandwidth of 0.241
Picking joint bandwidth of 0.763
Picking joint bandwidth of 0.657
Picking joint bandwidth of 0.102
#plotdensity_flowset(flowset_retro)
Now we will test out clustering. For Seurat clustering and Phenograph we will make the Seurat object; FlowSOM takes in the dataframe directly.
# make_seu takes the expression data frame and the antibody/marker names (as a
# vector) and creates a seurat object. Values are scaled, marker expression is
# stored in the "RNA" slot and PCA is calculated using the AB vector as features.
# Always keep the same antibody order or your labels will not be correct.
# This is also the order in which the antibodies appear on the plots.
AB <- c(
  "CD24", "CD56", "CD29", "CD15", "CD184", "CD133", "CD71",
  "CD44", "GLAST", "AQP4", "HepaCAM", "CD140a", "O4"
)
seu <- make_seu(df, AB_vector = AB)
Warning: Using an external vector in selections was deprecated in tidyselect
1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(AB_vector)
# Now:
data %>% select(all_of(AB_vector))
See
<https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning
was generated.
Centering and scaling data matrix
|
| | 0%
|
|============================================================| 100%
Warning in irlba(A = t(x = object), nv = npcs, ...) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
PC_ 1
Positive: CD44, CD184, CD24, CD56, CD15, CD133
Negative: CD140a, O4, CD29, CD71, HepaCAM, AQP4
PC_ 2
Positive: CD29, CD44, CD140a, CD184, CD71, O4
Negative: CD15, CD56, CD24, GLAST, CD133, AQP4
PC_ 3
Positive: CD56, CD29, CD133, CD24, AQP4, GLAST
Negative: CD44, CD184, CD140a, O4, HepaCAM, CD15
PC_ 4
Positive: CD133, AQP4, CD184, HepaCAM, CD71, GLAST
Negative: CD56, CD24, CD15, CD44, CD29, CD140a
PC_ 5
Positive: CD184, CD24, CD29, CD140a, O4, HepaCAM
Negative: CD44, CD133, CD56, CD71, AQP4, CD15
Save dataframe and seurat object for later
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# save the processed data frame for later
# template: write.csv(df, "pathway/filename.csv")
#write.csv(df, paste(output_path,"df9000Feb15.csv", sep = ""))
write.csv(df, paste0(output_path, "df9000Feb24.csv"))
# save the seurat object alongside it
saveRDS(seu, paste0(output_path, "seu9000Feb24.RDS"))
Read in the csv of the flow files processed and the seurat object
# Reload the processed data frame and the seurat object from disk.
# NOTE(review): these are the "Feb15" files, while the save step in this
# notebook writes "Feb24" files - confirm which pair is intended here.
df.input <- read.csv("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/df9000Feb15.csv")
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb15.RDS")
Test Flowsom
# outputs of the FlowSOM parameter exploration go to their own folder
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/flow/"
flowsom.test <- flowsom(
  input = seu, # seurat object
  df_input = df.input, # the processed df file before being converted to seurat
  flow_k = c(3, 5),
  run.stats = TRUE,
  run.plot = FALSE,
  save_to = output_path
)
Error in h(simpleError(msg, call)) :
error in evaluating the argument 'x' in selecting a method for function 'nrow': unused argument (where(is.numeric))
Explore cluster parameters
# Keep only the antibody columns. all_of() is needed when selecting with an
# external character vector; passing the bare vector is deprecated in
# tidyselect. all_of() errors if any antibody column is missing.
df2 <- df.input %>% dplyr::select(dplyr::all_of(AB)) # need to add this line into the main explore param function
Phenograph
# outputs of the Phenograph parameter exploration go to their own folder
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/pheno/"
pheno.test <- phenograph(
  input = seu,
  df_input = df.input,
  pheno_lou_kn = c(20, 40),
  run.stats = TRUE,
  run.plot = TRUE,
  save_to = output_path
)
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Computing nearest neighbor graph
Computing SNN
11:41:14 UMAP embedding parameters a = 0.9922 b = 1.112
11:41:14 Read 197160 rows and found 12 numeric columns
11:41:14 Using Annoy for neighbor search, n_neighbors = 20
11:41:14 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:41:29 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//RtmpFmyVpn/file178c93f4deb41
11:41:29 Searching Annoy index using 1 thread, search_k = 2000
11:42:23 Annoy recall = 100%
11:42:24 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
11:42:31 Initializing from normalized Laplacian + noise (using irlba)
11:42:44 Commencing optimization for 200 epochs, with 5499496 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
11:44:32 Optimization finished
Run Rphenograph starts:
-Input data of 197160 rows and 13 columns
-k is set to 20
Finding nearest neighbors...DONE ~ 51.131 s
Compute jaccard coefficient between nearest-neighbor sets...DONE ~ 16.735 s
Build undirected graph from the weighted links...DONE ~ 19.493 s
Run louvain clustering on the graph ...DONE ~ 21.039 s
Run Rphenograph DONE, totally takes 108.397999999986s.
Return a community class
-Modularity value: 0.8848982
-Number of clusters: 42
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Error in select(., where(is.numeric)) :
unused argument (where(is.numeric))
Test functions Louvain
# outputs of the Louvain parameter exploration go to their own folder
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/seu/"
lou.test <- louvain(
  input = seu, # seurat object
  df_input = df.input,
  pheno_lou_kn = c(20, 40),
  resolutions = c(0.2, 0.5),
  run.plot = FALSE, # option to save the graphs
  run.stats = FALSE, # option to save the stats list
  # pass the folder by name; the original passed an undefined bare symbol
  # `save_to`, which only went unnoticed because nothing was being saved
  save_to = output_path
)
Computing nearest neighbor graph
Computing SNN
12:58:17 UMAP embedding parameters a = 0.9922 b = 1.112
12:58:17 Read 197160 rows and found 12 numeric columns
12:58:17 Using Annoy for neighbor search, n_neighbors = 20
12:58:17 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
12:58:31 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//RtmpFmyVpn/file178c9321b124b
12:58:32 Searching Annoy index using 1 thread, search_k = 2000
12:59:25 Annoy recall = 100%
12:59:26 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
12:59:32 Initializing from normalized Laplacian + noise (using irlba)
12:59:44 Commencing optimization for 200 epochs, with 5499496 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:01:31 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 197160
Number of edges: 5227227
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9166
Number of communities: 14
Elapsed time: 110 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 197160
Number of edges: 5227227
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8845
Number of communities: 22
Elapsed time: 140 seconds
Computing nearest neighbor graph
Computing SNN
13:07:10 UMAP embedding parameters a = 0.9922 b = 1.112
13:07:10 Read 197160 rows and found 12 numeric columns
13:07:10 Using Annoy for neighbor search, n_neighbors = 40
13:07:10 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:07:25 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//RtmpFmyVpn/file178c9f4e13a0
13:07:25 Searching Annoy index using 1 thread, search_k = 4000
13:08:56 Annoy recall = 100%
13:08:57 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 40
13:09:08 Initializing from normalized Laplacian + noise (using irlba)
13:09:32 Commencing optimization for 200 epochs, with 11184324 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
13:11:50 Optimization finished
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 197160
Number of edges: 11735573
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9134
Number of communities: 12
Elapsed time: 212 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 197160
Number of edges: 11735573
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8785
Number of communities: 21
Elapsed time: 197 seconds
Annotate clusters 1. Visualization for manual annotation. - output by clustering function 2. CAM (Correlation assignment model) - requires reference matrix 3. RFM (Random Forest Model) - requires annotated matching flow dataset 4. Seurat label transfer - requires annotated matching flow data in a seurat object
# run clustering with only the desired conditions
# TEMP: the steps are run directly here instead of through the clustering function
seu <- RunPCA(seu, features = AB)
Warning in irlba(A = t(x = object), nv = npcs, ...) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
Warning: Requested number is larger than the number of available items (13). Setting to 13.
PC_ 1
Positive: CD44, CD184, CD24, CD56, CD15, CD133
Negative: CD140a, O4, CD29, CD71, HepaCAM, AQP4
PC_ 2
Positive: CD29, CD44, CD140a, CD184, CD71, O4
Negative: CD15, CD56, CD24, GLAST, CD133, AQP4
PC_ 3
Positive: CD56, CD29, CD133, CD24, AQP4, GLAST
Negative: CD44, CD184, CD140a, O4, HepaCAM, CD15
PC_ 4
Positive: CD133, AQP4, CD184, HepaCAM, CD71, GLAST
Negative: CD56, CD24, CD15, CD44, CD29, CD140a
PC_ 5
Positive: CD184, CD24, CD29, CD140a, O4, HepaCAM
Negative: CD44, CD133, CD56, CD71, AQP4, CD15
# dims = 1:12 is one less than the 13 antibodies in AB
seu <- FindNeighbors(seu, dims = 1:12, k.param = 60)
Computing nearest neighbor graph
Computing SNN
# must take one less than the number of antibodies
# NOTE(review): this presumably refers to the dims = 1:12 used by
# FindNeighbors/RunUMAP (13 antibodies), not to the resolution - confirm
seu <- FindClusters(seu, resolution = 0.8)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 73578
Number of edges: 6276606
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8455
Number of communities: 22
Elapsed time: 54 seconds
# UMAP embedding on the first 12 principal components
seu <- RunUMAP(
  seu,
  dims = 1:12,
  n.neighbors = 46,
  min.dist = 0.4,
  spread = 1.5
)
16:49:43 UMAP embedding parameters a = 0.4502 b = 1.076
16:49:43 Read 73578 rows and found 12 numeric columns
16:49:43 Using Annoy for neighbor search, n_neighbors = 46
16:49:43 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:49:49 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmpriq53r/file731c697bec6f
16:49:49 Searching Annoy index using 1 thread, search_k = 4600
16:50:22 Annoy recall = 100%
16:50:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 46
16:50:29 Initializing from normalized Laplacian + noise (using irlba)
16:50:39 Commencing optimization for 200 epochs, with 4811862 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:51:33 Optimization finished
# UMAP coloured by the current identities (the clusters just computed)
DimPlot(seu)
NA
NA
# save with graph
# this writes to the same seu9000Feb24.RDS path used when the object was first
# saved, so the earlier file is overwritten
saveRDS(seu,"/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Visualize expression on UMAP and with heat maps
# draw one UMAP feature plot per antibody so each marker can be viewed on its own
for (marker in AB) {
  marker_plot <- FeaturePlot(
    seu,
    features = marker,
    min.cutoff = "q1",
    max.cutoff = "q95",
    label = TRUE
  )
  print(marker_plot)
}
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Rasterizing points since number of points exceeds 100,000.
To disable this behavior set `raster=FALSE`
Some more visualization of expression values
# summary heat map
# the plotmean function is used below; first count the clusters at resolution 0.8
length(unique(seu$RNA_snn_res.0.8))
[1] 22
# there are 22 clusters, numbered 0 to 21
cluster.num <- 0:21
plotmean(
  plot_type = "heatmap",
  seu = seu,
  group = "RNA_snn_res.0.8",
  markers = AB,
  var_names = cluster.num,
  slot = "scale.data",
  xlab = "Cluster",
  ylab = "Antibody"
)
NA
NA
output_path <- "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/"
# paste0() joins without a separator; paste() with its default sep = " " put a
# space between the folder and the file name (".../testingLibrary/ cluster9000.RDS")
saveRDS(seu, paste0(output_path, "cluster9000.RDS"))
Predict cell annotations with CAM (Correlation assignment method)
# reference matrix and the processed data to be annotated
reference_path <- "/Users/rhalenathomas/GITHUB/CelltypeR/Data/ReferenceMatrix9celltypesOrdered.csv"
test_data <- read.csv("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/exp_clusters/df9000.csv")
reference_data <- read.csv(reference_path)
# the result is a dataframe with cor1, cor2 and the predicted cell type label
cor <- find_correlation(
  test_data,
  reference_data,
  min_corr = 0.5,
  min_diff = 0.05
)
Visualize the CAM results
# plot the correlation results returned by find_correlation
plot_corr(cor)
Warning in melt(df) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(df). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
Warning in melt(df.downsample) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(df.downsample). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
Warning in melt(double.cells) :
The melt generic in data.table has been passed a data.frame and will attempt to redirect to the relevant reshape2 method; please note that reshape2 is deprecated, and this redirection is now deprecated as well. To continue using melt methods from reshape2 while both libraries are attached, e.g. melt.list, you can prepend the namespace like reshape2::melt(double.cells). In the next version, this warning will become an error.
Using X, best.cell.type, second.cell.type, cell.label as id variables
[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
Warning: Removed 35 rows containing missing values (`geom_line()`).
Warning: Removed 35 rows containing missing values (`geom_point()`).
Apply correlation predictions to clusters and output a vector for annotation functions
# list the distinct CAM labels stored on the seurat object
# NOTE(review): the step that adds 'cor.labels' to seu is not shown in this
# notebook - confirm it has been run before this line
unique(seu$cor.labels)
[1] "unknown" "RG" "StemCell"
[4] "Oligo" "Neuron" "StemCell-Neuron"
[7] "OPC" "Endothelial" "Neuron-StemCell"
[10] "Endothelial-RG" "NPC" "Neuron-NPC"
[13] "NPC-StemCell" "RG-Endothelial" "RG-Astrocyte"
[16] "NPC-Neuron" "Astrocyte" "OPC-Neuron"
[19] "StemCell-NPC" "Neuron-OPC" "OPC-Oligo"
[22] "Astrocyte-RG" "Oligo-OPC" "Endothelial-Astrocyte"
[25] "OPC-StemCell" "StemCell-OPC" "Oligo-NPC"
[28] "NPC-OPC" "Oligo-Neuron" "NPC-Oligo"
[31] "OPC-NPC" "Oligo-StemCell" "StemCell-Oligo"
Run get annotations function to return a vector of annotation in the order of the clusters.
# NOTE(review): cor.ann is not created anywhere in the code shown here; it is
# presumably the get_annotation() result for the CAM labels - confirm
length(cor.ann$CAM)
[1] 22
Use a trained Random Forest model to predict cell types. Training of the Random Forest model with an annotated data set is below.
# a saved model, trained on a data object annotated with the same markers, is required
rf <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/PaperFigures/RFM_trained.11072022.Rds")
rfm.pred <- RFM_predict(seu, rf)
head(rfm.pred)
# store the predicted labels as a new metadata column on the seurat object
seu <- AddMetaData(
  object = seu,
  metadata = rfm.pred$`predict(rf, df)`,
  col.name = "rfm.labels"
)
# check that the labels were added
table(seu$rfm.labels)
Get the annotation by cluster for the RFM
# annotation per cluster from the RFM labels, leaving out the placeholder labels
rfm.ann <- get_annotation(
  seu, seu$RNA_snn_res.0.8, seu$rfm.labels,
  top_n = 3,
  filter_out = c("unknown", "Unknown", "Mixed", "Mix"),
  Label = "RFM"
)
rfm.ann
# alternative call without filtering (disabled):
#rfm.ann <- get_annotation(seu, seu$RNA_snn_res.0.8,seu$rfm.labels,
# top_n = 3, filter_out = FALSE, Label = "RFM")
rfm.ann
dim(rfm.ann)
Plot RFM predictions
# Plot the RFM-predicted labels per cluster. The filtered labels now match the
# get_annotation() call for the RFM labels; "Mix" was missing from this list.
plot_lab_clust(
  seu,
  seu.cluster = seu$RNA_snn_res.0.8,
  seu.labels = seu$rfm.labels,
  filter_out = c("unknown", "Unknown", "Mixed", "Mix")
)
Predicting cell types with Seurat label transfer using anchors
# label transfer needs a reference data object that already carries labels
seu.r <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/PaperFigures/Seu9000annot.08072021.RDS")
# the output is a seurat object with the predicted annotations
seu <- seurat_predict(
  seu, seu.r,
  ref_id = "subgroups",
  down.sample = 500,
  markers = AB
)
# plot the seurat anchor predictions
plot_lab_clust(seu, seu$RNA_snn_res.0.8, seu$seu.pred)
# get the annotation table for the seurat anchor predictions:
#   - takes in a seurat object with the predicted labels in the meta data
#   - takes the clusters meta data slot to be labelled
#   - takes the meta data slot with the labels (correlation, random forest,
#     seurat predicted)
#   - makes a dataframe with the count of predicted labels for each cluster
# to not filter anything use c()
seu.ann <- get_annotation(
  seu, seu$RNA_snn_res.0.8, seu$seu.pred,
  top_n = 3,
  filter_out = c(),
  Label = "Seurat"
)
seu.ann
Get a consensus of cluster annotations, Add the annotations to the seurat object
# gather the annotation tables from each method
# NOTE(review): man.ann and df.cor.ann are not created in the code shown in
# this notebook - confirm they exist before running
ann.list <- list(man.ann, df.cor.ann, rfm.ann, seu.ann)
# annotate the seurat object
seu <- cluster_annotate(
  seu,
  ann.list,
  annotation_name = "CellType",
  to_label = "RNA_snn_res.0.8"
)
DimPlot(seu, group.by = "CellType")
NA
NA
Just use the annotate functions
# apply a single annotation vector directly
# NOTE(review): ggplot2 also exports annotate(); confirm the package version is
# the one on the search path when this runs
seu <- annotate(
  seu,
  annotations = man.ann$manual,
  to_label = "RNA_snn_res.0.8",
  annotation_name = "MyCellType"
)
DimPlot(seu, group.by = "MyCellType")
# save with the annotations
saveRDS(seu, "/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Compare groups
# reload the annotated object from the seu9000Feb24.RDS path it was saved to
seu <- readRDS("/Users/rhalenathomas/Documents/Data/FlowCytometry/PhenoID/Analysis/testingLibrary/seu9000Feb24.RDS")
Add the variables into the seurat object
# Per-sample experimental variables. Values are matched to samples by
# position, so each vector must follow the order of levels(seu) once the
# identities are set to "Sample".
Genotype <- c("3450", "3450", "3450", "AIW002", "AIW002", "AIW002", "AJG001C", "AJG001C", "AJG001C")
ExDate <- c("0306", "0317", "0317", "0306", "0317", "0317", "0306", "0317", "0317")
Batch <- c("B", "B", "A", "B", "B", "A", "B", "B", "A")
Age <- c("273", "284", "263", "273", "284", "263", "273", "284", "263")

# metadata slot name -> values to store there
# (Age is stored as "DaysinCulture": days in final differentiation media)
sample_variables <- list(
  Genotype = Genotype,
  ExDate = ExDate,
  Batch = Batch,
  DaysinCulture = Age
)

for (slot_name in names(sample_variables)) {
  Idents(seu) <- "Sample" # start again from the sample identities
  cluster.ids <- sample_variables[[slot_name]] # vector with the new names
  names(cluster.ids) <- levels(seu) # get the levels
  seu <- RenameIdents(seu, cluster.ids) # rename
  seu[[slot_name]] <- Idents(seu) # add a new dataslot
}
Plot some variables
# one plot
# seu.q is not defined anywhere in this workflow; use the annotated object seu,
# which also supplies seu.var and seu.lable
proportionplots(seu, seu.var = seu$Genotype, seu.lable = seu$CellType, groups = "Genotype")
[1] "Number of colours needed17"
[1] "Number of colours entered 1"
[1] "Default ggplot colours used"
# add colours (one per cell type; extra colours are allowed)
colours <- c(
  "chocolate1", "chocolate3", "orange",
  "lightsalmon", "pink", "lightpink3",
  "steelblue3", "deepskyblue",
  "plum3", "purple",
  "seagreen3", "tomato4", "burlywood3", "grey90", "brown",
  "royalblue3", "tan4", "yellowgreen"
)
# seu.q is not defined anywhere in this workflow; use the annotated object seu,
# which also supplies seu.var and seu.lable
proportionplots(
  seu,
  seu.var = seu$Genotype,
  seu.lable = seu$CellType,
  groups = "Genotype",
  my_colours = colours
)
[1] "Number of colours needed17"
[1] "Number of colours entered 18"
[1] "Custome colours used."
# display names and the matching metadata vectors, in the same order
varnames <- c("Days in Culture", "Batch", "Experiment Date", "Genotype")
var.list <- list(seu$DaysinCulture, seu$Batch, seu$ExDate, seu$Genotype)
# plot all the variables of interest at once
plotproportions(
  seu,
  var.list = var.list,
  xgroup = seu$CellType,
  varnames = varnames,
  my_colours = c("blue", "red")
)
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
[1] "Number of colours needed17"
[1] "Number of colours entered 2"
[1] "Default ggplot colours used"
Heatmaps
# cell type names for the heat map - make sure the order is correct
celltypes <- c(
  "unknown", "radial glia 1", "astrocytes 1", "mixed", "neurons 1",
  "neurons 2", "epithelial", "astrocytes mature", "npc", "radial glia 2",
  "radial glia 3", "endothelial", "neurons 3", "astrocytes 2",
  "oligodendrocytes", "stem-like 1", "neural stem"
)
plotmean(
  plot_type = "heatmap",
  seu = seu,
  group = "CellType",
  markers = AB,
  var_names = celltypes,
  slot = "scale.data",
  xlab = "Cell Type",
  ylab = "Antibody"
)
NA
NA
# dot plot
# original order of the cell type names
og.order <- c(
  "unknown", "radial glia 1", "astrocytes 1", "mixed", "neurons 1",
  "neurons 2", "epithelial", "astrocytes mature", "npc", "radial glia 2",
  "radial glia 3", "endothelial", "neurons 3", "astrocytes 2",
  "oligodendrocytes", "stem-like 1", "neural stem"
)
# desired display order, reversed before it is passed to the plot;
# the terms must be exactly the same as above and none may be missed
new.order <- rev(c(
  "astrocytes 1", "astrocytes 2", "astrocytes mature",
  "endothelial", "epithelial", "mixed", "neurons 1",
  "neurons 2", "neurons 3", "neural stem", "npc",
  "oligodendrocytes",
  "radial glia 1", "radial glia 2", "radial glia 3", "stem-like 1",
  "unknown"
))
# antibody display order
AB.order <- c(
  "CD24", "CD56", "CD29", "CD15", "CD184", "CD133", "CD71",
  "CD44", "GLAST", "AQP4", "HepaCAM", "CD140a", "O4"
)

plotmean(
  plot_type = "dotplot",
  seu = seu,
  group = "CellType",
  markers = AB,
  var_names = celltypes,
  slot = "scale.data",
  xlab = "Cell Type",
  ylab = "Antibody",
  var1order = new.order,
  var2order = AB.order
)
NA
NA
Statistics comparing groups
# prepare a dataframe for stats
# the stats function takes the annotated seurat object, with all the variables
# already existing as metadata slots
# check which metadata slots exist in your object
colnames(seu@meta.data)
[1] "orig.ident" "nCount_RNA" "nFeature_RNA" "Sample"
[5] "RNA_snn_res.0.8" "seurat_clusters" "cor.labels" "rfm.labels"
[9] "seu.pred" "CellType" "MyCellType" "Genotype"
[13] "ExDate" "Batch" "DaysinCulture"
Train Random Forest model. Requires a labelled seurat object. More cells will give higher accuracy but increase computation time. Run on an HPC when using lots of cells.
# Train the Random Forest model on the annotated seurat object.
# NOTE(review): 'CellType3' is not among the metadata columns printed by
# colnames(seu@meta.data) in this notebook (only CellType and MyCellType are) -
# confirm the annotation column before running.
rf <- RFM_train(
  seurate_object = seu,
  AB_list = AB,
  annotations = seu$CellType3,
  split = c(0.8, 0.2),
  downsample = 20000,
  seed = 222,
  mytry = 1:10,
  maxnodes = 12:25,
  trees = c(250, 500, 1000, 2000),
  start_node = 15
)
# save() was called without a `file` argument and does not write an RDS file.
# The prediction step reads its trained model with readRDS(), so write this
# one with saveRDS() to the joined output path.
saveRDS(rf, paste0(output_path, "trainedRFMFeb14.Rds"))